import pandas as pd
import numpy as np
import scipy as sc
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import math
import sklearn
# Global matplotlib defaults: 15pt font, 5x5-inch figures, inward 8pt ticks
# with 12pt padding on both axes.
plt.rc("font", size=15)
pylab.rcParams['figure.figsize'] = 5, 5
for axis in ('xtick', 'ytick'):
    plt.rc(axis, color='k', labelsize='medium', direction='in')
    plt.rc(axis + '.major', size=8, pad=12)
    plt.rc(axis + '.minor', size=8, pad=12)
This is a first step toward using Twitter to measure the US population's polarity toward politicians' personalities.
To start working with the Twitter API, a deep-learning model like BERT, and some data visualization in the aforementioned context, I decided to search Twitter for tweets mentioning the handle @POTUS, every day for a week, spanning as many US cities and states as possible. Once those tweets were collected, I ran a home-trained, fine-tuned BERT sentiment classifier (positive, negative, neutral) to embed those tweets in a sentiment space.
I wanted to see whether states that were very similarly or oppositely polarized in their way of voting at US elections also showed, on average, similar or opposite temporal sentiment trends.
Only two states are plotted here, but the code allows you to plot all of them.
# Presidential election results 1976-2020: one row per (year, state, candidate).
df_elections = pd.read_csv("1976-2020-president.csv")
states = df_elections['state_po'].unique()
years = df_elections['year'].unique()
# Full state name (upper case) -> two-letter postal abbreviation.
dico_states = {name: po for name, po in zip(list(df_elections['state']),
                                            list(df_elections['state_po']))}
from operator import itemgetter

# For every state and election year, keep the winning ticket
# [party, candidatevotes, totalvotes] plus a two-party polarity score.
dico_time_series = {v: [] for v in states}
for state in states:
    for year in years:
        # Boolean mask instead of the original per-row index scan.
        in_race = (df_elections['state_po'] == state) & (df_elections['year'] == year)
        to_max = df_elections.loc[in_race,
                                  ['party_detailed', 'candidatevotes', 'totalvotes']].values.tolist()
        # Extract the two major-party vote counts once (the original rebuilt
        # six list comprehensions and relied on a bare except to catch the
        # IndexError when a party was missing).
        dem = next((row[1] for row in to_max if row[0] == 'DEMOCRAT'), None)
        rep = next((row[1] for row in to_max if row[0] == 'REPUBLICAN'), None)
        if dem is None or rep is None or dem + rep == 0:
            polar = -1  # sentinel: polarity undefined for this state/year
        else:
            polar = abs(dem - rep) / (dem + rep)
        # Winner = the candidate with the most votes in that state/year.
        dico_time_series[state].append(sorted(to_max, key=itemgetter(1), reverse=True)[0] + [polar])
dico_weigthed_average_elections={v:[] for v in dico_time_series.keys()}
for v in dico_time_series.keys():
color=np.mean([1 for t in dico_time_series[v][-6:] if t[0]=='DEMOCRAT' and t[-1]!=-1])
if color>=0.5:
dico_weigthed_average_elections[v].append(-1)
else:
dico_weigthed_average_elections[v].append(1)
sum_votes=np.sum([t[-2] for t in dico_time_series[v][-6:] if t[-1]!=-1])
#*t[-2]/sum_votes
dico_weigthed_average_elections[v].append(np.mean([t[-1]*t[-2]/sum_votes for t in dico_time_series[v][-6:] if t[-1]!=-1]))
/home/seb/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, /home/seb/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
Here polarity is defined as: $\frac{|n_{vote}(\text{Democrat})-n_{vote}(\text{Republican})|}{n_{vote}(\text{Democrat})+n_{vote}(\text{Republican})}$
import plotly.graph_objects as go

# Plot the per-state polarity time series for the selected states, markers
# coloured by the winning party of each election.
states = ['california', 'texas']
# BUG FIX: the original called pd.read_csv() with no argument, a TypeError.
# TODO(review): confirm the filename of the saved election time-series export.
df_election_timeseries = pd.read_csv("election_timeseries.csv")
fig = go.Figure()
for s in states:
    # Rows for this state with a defined polarity (-1 is the undefined sentinel);
    # one mask instead of the original's four identical index-comprehension scans.
    keep = (df_election_timeseries.states == s) & (df_election_timeseries.polarity != -1)
    sub = df_election_timeseries[keep]
    df_election_timeseries_ = pd.DataFrame({'years': list(sub.year),
                                            'polarity': list(sub.polarity),
                                            'states': list(sub.states),
                                            'winner': list(sub.winner)})
    # Blue markers for Democratic wins, red otherwise.
    col = ['blue' if w == 'DEMOCRAT' else 'red' for w in df_election_timeseries_.winner]
    fig.add_trace(go.Scatter(x=df_election_timeseries_.years,
                             y=df_election_timeseries_.polarity,
                             mode='markers+lines', name=s,
                             marker=dict(size=10, color=col),
                             connectgaps=False, line=go.scatter.Line(color="gray"),
                             hovertemplate='<i><b>Polarity<b></i>: %{y:.2f}' +
                                           '<br><b>Years</b>: %{x}<br>' +
                                           '<b>%{text}</b>',
                             text=df_election_timeseries_.winner))
fig.update_layout(title='Polarity as the absolute normalized<br>difference of votes for Democrate and Republicans<br>per states per election years',
                  title_x=0.5,
                  xaxis_title='Election year',
                  yaxis_title='Polarity')
fig.show()
The weighted average is taken over the total number of voters per year and per state.
# Build the map table: postal code, zero-padded FIPS id, title-cased state
# name (to match 'properties.name' in the geojson), and signed polarity
# (sign = party colour, magnitude = turnout-weighted polarity).
dico_state_fips = {po: fips for po, fips in zip(list(df_elections['state_po']),
                                                list(df_elections['state_fips']))}
dico_states_reverse = {dico_states[v]: v.lower() for v in dico_states.keys()}
df_for_map = pd.DataFrame(
    [[v,
      str(dico_state_fips[v]).zfill(2),
      ' '.join(w[:1].upper() + w[1:] for w in dico_states_reverse[v].split(' ')),
      dico_weigthed_average_elections[v][0] * dico_weigthed_average_elections[v][1]]
     for v in dico_weigthed_average_elections.keys()],
    columns=['states_po', 'id', 'name', 'polarity'])
# BUG FIX: df["name"].iloc[8] = ... is chained assignment (emits
# SettingWithCopyWarning and may silently not write); use one .loc call.
df_for_map.loc[df_for_map.index[8], "name"] = 'District of Columbia'
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json') as response:
    states_json = json.load(response)
import plotly.express as px
fig = px.choropleth(df_for_map, geojson=states_json, locations='name',
                    featureidkey='properties.name', color='polarity',
                    color_continuous_scale=['Blue', 'Red'],
                    range_color=(-0.04, 0.04),
                    scope="usa",
                    labels={'polarity': 'weighted averaged<br> polarity since 2000'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
/home/seb/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import os

# Collect the daily BERT-scored tweet dumps, one "@POTUS_<date>_eval.csv"
# per scraped day.
# BUG FIX: os.listdir('') raises FileNotFoundError on Python 3 — list the
# current directory explicitly; drop the redundant "== True".
list_files = [s for s in os.listdir('.') if s.startswith("@POTUS")]
list_files
['@POTUS_2021-02-04_eval.csv', '@POTUS_2021-01-30_eval.csv', '@POTUS_2021-02-02_eval.csv', '@POTUS_2021-02-01_eval.csv', '@POTUS_2021-02-03_eval.csv', '@POTUS_2021-01-31_eval.csv', '@POTUS_2021-01-29_eval.csv']
# Concatenate the daily dumps into a single dataframe with a fresh 0..n index.
adresse = ""
frames = [pd.read_csv(adresse + filename) for filename in list_files]
df_concat = pd.concat(frames)
df_concat.reset_index(drop=True, inplace=True)
# Distinct scrape days: the ISO date part of each creation timestamp, sorted.
days = sorted({stamp.split(" ")[0] for stamp in df_concat.time_created})
df_cities_states = pd.read_csv("us_cities_states_counties.csv", sep='|')
# City name -> two-letter state code.
dico_cities_states = {c: s for c, s in zip(list(df_cities_states['City']),
                                           list(df_cities_states["State short"]))}
# Per-state list of ([neutral, positive, negative] scores, ISO day) tweets.
dico_tweet_state = {s: [] for s in dico_states.values() if type(s) == str}
# Per-state count of distinct sampled cities.
dico_state_taken = {s: 0 for s in set(dico_cities_states.values())}
ville = []  # distinct city names seen so far


def _clean(token):
    """Strip the b'...' bytes-literal artefacts left by the scraper."""
    return token.strip("b'").strip("''")


for i in df_concat.index:
    s = df_concat["user_location"].iloc[i]
    split_s = s.split(',')
    if len(split_s) > 1:
        # Locations look like "city, state"; normalise the state token.
        s = split_s[1].strip("''").strip(" ").upper()
        city = _clean(split_s[0])
        if len(s) > 2:
            # Token after the comma is a full state name -> postal code.
            # BUG FIX: the original used a bare "except: None"; only the
            # expected KeyError (unknown name) is swallowed now.
            try:
                s = dico_states[s.upper()]
                # NOTE(review): nesting reconstructed from the flattened
                # source — the city list and city counter are assumed to
                # update together for not-yet-seen cities; confirm intent.
                if city not in ville:
                    ville.append(city)
                    dico_state_taken[s] += 1
            except KeyError:
                pass  # unknown state name: s stays as-is and is filtered below
        else:
            # Short token: try it as a key of the city -> state table.
            try:
                s = dico_cities_states[_clean(s)]
                if city not in ville:
                    ville.append(city)
                    dico_state_taken[s] += 1
            except KeyError:
                pass  # not a known city; s may already be a valid postal code
    if s in dico_tweet_state.keys():
        dico_tweet_state[s].append([[df_concat["neutral/ambiguous"].iloc[i],
                                     df_concat["positive"].iloc[i],
                                     df_concat["negative"].iloc[i]],
                                    df_concat["time_created"].iloc[i].split(' ')[0]])
# Sort each state's tweets chronologically (sort key = the ISO day string).
for state in dico_tweet_state.keys():
    dico_tweet_state[state] = sorted(dico_tweet_state[state], key=itemgetter(1))

# matrix_counts[state_row, day_col] = number of tweets collected.
matrix_counts = np.zeros((len(dico_tweet_state), len(days)), dtype='f')
dico_days = {day: col for col, day in enumerate(days)}
dico_int_days = {state: row for row, state in enumerate(list(dico_tweet_state.keys()))}
for state, records in dico_tweet_state.items():
    row = dico_int_days[state]
    for record in records:
        matrix_counts[row][dico_days[record[1]]] += 1
I wrote a script that daily requests tweets from 2 days earlier (to be sure that each state has been through its whole day), containing @POTUS in their text, for 7768 cities in the US (a 20 km radius around each defining GPS coordinate). At the end of these 8 days of scraping Twitter, I was able to find tweets from only 1323 of those cities, which translates as follows in terms of state sampling. Quite uneven sampling.
# Same base table as the polarity map, plus the number of sampled cities.
df_for_map = pd.DataFrame(
    [[v,
      str(dico_state_fips[v]).zfill(2),
      ' '.join(w[:1].upper() + w[1:] for w in dico_states_reverse[v].split(' ')),
      dico_weigthed_average_elections[v][0] * dico_weigthed_average_elections[v][1]]
     for v in dico_weigthed_average_elections.keys()],
    columns=['states_po', 'id', 'name', 'polarity'])
# BUG FIX: chained .iloc assignment -> single .loc write (no SettingWithCopyWarning).
df_for_map.loc[df_for_map.index[8], "name"] = 'District of Columbia'
df_for_map['Number_of_cities'] = [dico_state_taken[p] for p in df_for_map['states_po']]
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json') as response:
    states_json = json.load(response)
import plotly.express as px
# BUG FIX: labels was a set literal; px.choropleth expects a {column: label} dict.
fig = px.choropleth(df_for_map, geojson=states_json, locations='name',
                    featureidkey='properties.name', color='Number_of_cities',
                    scope="usa",
                    labels={'Number_of_cities': 'Number of city per states used for picking tweets'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
/home/seb/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Heatmap of log10 tweet counts per (state, day) with marginal bar charts:
# per-day totals on top, per-state totals on the right.
fig = make_subplots(rows=2, cols=2,
                    column_widths=[3, 1],
                    row_heights=[2, 3],
                    shared_yaxes=True, shared_xaxes=True)
# FIX: zero counts make log10 emit a divide-by-zero RuntimeWarning; the
# resulting -inf cells are intentional (rendered blank), so silence only
# that warning instead of letting it print.
with np.errstate(divide='ignore'):
    log_counts = np.log10(matrix_counts)
fig.add_trace(go.Heatmap(z=log_counts, x=days, y=list(dico_tweet_state.keys()),
                         colorbar={}, colorscale='plasma', name='',
                         hovertemplate='<i><b>Day<b></i>: %{x}' +
                                       '<br><b>State</b>: %{y}<br>' +
                                       '<b>Log10_Count</b>: %{z:.2f}'),
              row=2, col=1)
fig.add_trace(go.Bar(y=np.sum(matrix_counts, axis=0), name='',
                     hovertemplate='<i><b>Count<b></i>: %{y:.2f}' +
                                   '<br><b>%{text}</b>', text=days),
              row=1, col=1)
fig.add_trace(go.Bar(x=np.sum(matrix_counts, axis=1), orientation='h', name='',
                     hovertemplate='<i><b>Count<b></i>: %{x:.2f}' +
                                   '<br><b>%{text}</b>',
                     text=list(dico_tweet_state.keys())),
              row=2, col=2)
fig.update_layout(showlegend=False)
fig.show()
<ipython-input-15-7860371bb301>:9: RuntimeWarning: divide by zero encountered in log10
So there is a big difference in sampling across states, and between weekdays and the weekend.
# Keep only states with more than 50 tweets on every retained day
# (day 0 is dropped later as it was incompletely scraped).
kept_states = [state for state, counts in zip(dico_tweet_state.keys(), matrix_counts)
               if min(counts[1:]) > 50]
print(len(kept_states), kept_states)
30 ['AL', 'AZ', 'AR', 'CA', 'CO', 'CT', 'FL', 'GA', 'IL', 'IN', 'KS', 'KY', 'MD', 'MA', 'MI', 'MN', 'MO', 'NJ', 'NY', 'NC', 'OH', 'OK', 'OR', 'PA', 'SC', 'TN', 'TX', 'VA', 'WA', 'WI']
I discard the states that do not reach at least 50 tweets per day.
days_kept=days[1:]
I get rid of the first day
mean_dico_tweet_state={s:{d:{e:0 for e in ["neutral/ambiguous","positive","negative"]} for d in days_kept} for s in kept_states if type(s)==str}#dico_states.values()
for s in mean_dico_tweet_state.keys():
for i in range(1,len(days),1):
#print([dico_tweet_state[s][k][0][0] for k in range(len(dico_tweet_state[s])) if dico_tweet_state[s][k][1]==days[i]])
mean_dico_tweet_state[s][days[i]]["neutral/ambiguous"]=np.mean([dico_tweet_state[s][k][0][0] for k in range(len(dico_tweet_state[s])) if dico_tweet_state[s][k][1]==days[i]])
mean_dico_tweet_state[s][days[i]]["positive"]=np.mean([dico_tweet_state[s][k][0][1] for k in range(len(dico_tweet_state[s])) if dico_tweet_state[s][k][1]==days[i]])
mean_dico_tweet_state[s][days[i]]["negative"]=np.mean([dico_tweet_state[s][k][0][2] for k in range(len(dico_tweet_state[s])) if dico_tweet_state[s][k][1]==days[i]])
The deep-learning model gives, for each sentence, its representation in a space described by three axes: positive, negative, neutral. Every sentence is a normed vector in this space.
To describe the emotion per state I simply average all the tweets from the same state and the same day. I am thus able to reconstruct a time series of Twitter emotion per state.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Build the (state-day x 3 emotions) matrix and its "STATE_day" row labels.
emotion_list = ["neutral/ambiguous", "positive", "negative"]
na = []
rows = []
for s in mean_dico_tweet_state.keys():
    for d in days_kept:
        na.append(s + '_' + d)
        rows.append([mean_dico_tweet_state[s][d][e] for e in emotion_list])
X = np.array(rows)
df_temp = pd.DataFrame(X, columns=emotion_list, index=na)

# Deliberately NOT standardised (hence the 'Not Scaled' title below).
scaled_data = df_temp
pca = PCA()
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
var_explained = pca.explained_variance_ratio_

# Scatter of the first two PCs, with the original emotion axes drawn as
# arrows (biplot-style); feature -> PC basis is just the transpose.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x_pca[:, 0], x_pca[:, 1])
feature_vectors = pca.components_.T
arrow_size, text_pos = 0.1, 0.1  # scaling so the arrows stay visible
for i, v in enumerate(feature_vectors):
    ax.arrow(0, 0, arrow_size * v[0], arrow_size * v[1],
             head_width=0.02, head_length=0.02, linewidth=2, color='red')
    ax.text(v[0] * text_pos, v[1] * text_pos, df_temp.columns[i],
            color='black', ha='left', va='top', fontsize=18)
plt.title('Not Scaled')
plt.xlabel('First Principal Component,\n var explained = {0:.2f}'.format(var_explained[0]))
plt.ylabel('Second Principal Component,\n var explained = {0:.2f}'.format(var_explained[1]))
plt.show()
As expected, since one dimension is constrained by the sum being one, the actual dimensionality is 2. Nicely, the positive axis mainly follows the first principal axis, and the second axis differentiates negative from neutral.
import plotly.express as px

# Postal code -> signed weighted polarity (from the election map table).
dico_state_plority = {s: p for s, p in zip(list(df_for_map.states_po), list(df_for_map.polarity))}
# FIX: the original built df_pca twice with identical contents and computed
# an unused 'col' list; build it once and drop the dead code.
df_pca = pd.DataFrame([[x_pca[i][0], x_pca[i][1]] for i in range(x_pca.shape[0])],
                      columns=["PCA1", "PCA2"])
df_pca['name'] = na
df_pca['polarity'] = [dico_state_plority[n.split('_')[0]] for n in na]
px.scatter(df_pca, hover_data=['name'],
           x="PCA1", y="PCA2", color='polarity',
           color_continuous_scale=['Blue', 'Red'],
           range_color=(-0.04, 0.04))
No pattern noticeable
# Postal-code -> polarity lookup plus a diverging blue/red colormap clipped
# at +/-0.04 (matching the choropleth colour range); reused by later cells.
dico_state_plority = {s: p for s, p in zip(list(df_for_map.states_po), list(df_for_map.polarity))}
from matplotlib.cm import get_cmap
from matplotlib.colors import PowerNorm
from matplotlib.colors import LinearSegmentedColormap, Normalize
colors = ['#0C4DFF', '#ff0000']
line_cmap = LinearSegmentedColormap.from_list('my_cmap', colors)
line_norm = Normalize(vmin=-0.04, vmax=0.04)

# Each row of pca.components_ expresses one principal axis in the original
# emotion basis — the coefficients in front of each feature.
df_comp = pd.DataFrame(pca.components_, columns=list(df_temp.columns))
plt.figure(figsize=(5, 5))
sns.heatmap(df_comp, cmap='plasma', annot=True, annot_kws={"size": 10})
n_axes = len(list(df_temp.columns))
plt.yticks(np.arange(0.5, n_axes + 0.5, 1),
           ['PCA axis ' + str(i + 1) for i in range(n_axes)], rotation=0)
plt.show()
We saw above that these PCA axes were more or less explained by the emotions, and this is just a matrix representation of that. I keep it here so it is easier to refer back to when trying to make sense of the graph below.
# 3-D trajectories: (PCA1, PCA2, date) per state, coloured by election polarity.
# BUG FIX: the original did df_temp_plus = df_temp (an alias), so the
# reset_index(inplace=True) below silently mutated df_temp; copy instead.
df_temp_plus = df_temp.copy()
df_temp_plus["date"] = [i.split('_')[1] for i in df_temp_plus.index]
df_temp_plus["state"] = [i.split('_')[0] for i in df_temp_plus.index]
df_temp_plus["polarity"] = [dico_state_plority[i.split('_')[0]] for i in df_temp_plus.index]
df_temp_plus2 = df_temp_plus
df_temp_plus2.reset_index(inplace=True)
states = ['CA', 'TX']
fig = go.Figure()
for s in states:
    # Row positions of this state (df_pca shares the same 0..n ordering).
    idx = [i for i in df_temp_plus2.index if df_temp_plus2.state.loc[i] == s]
    # FIX: the original named these columns 'positive'/'negative' although
    # they hold the PCA projections; use honest names (local to this cell).
    df_ = pd.DataFrame({'PCA1': [df_pca.PCA1.iloc[i] for i in idx],
                        'PCA2': [df_pca.PCA2.iloc[i] for i in idx],
                        'date': [df_temp_plus2.date.iloc[i] for i in idx],
                        'states': [df_temp_plus2.state.iloc[i] for i in idx]})
    col = [line_cmap(line_norm(dico_state_plority[s])) for _ in range(len(df_))]
    fig.add_trace(go.Scatter3d(x=df_.PCA1, y=df_.PCA2, z=df_.date,
                               mode='markers+lines',
                               marker=dict(size=10, color=col),
                               name=s, connectgaps=False,
                               hovertemplate='<i><b>PCA2<b></i>: %{y:.2f}<br>' +
                                             '<i><b>Date<b></i>: %{z}' +
                                             '<br><b>PCA1</b>: %{x:.2f}<br>'))
# FIX: for a 3-D plot the axis titles must live in the scene; the original
# top-level xaxis_title/yaxis_title were silently ignored.
fig.update_layout(title='Time series of emotions (PCA projected) in tweets',
                  title_x=0.5,
                  scene=dict(xaxis_title='PCA1', yaxis_title='PCA2'))
fig.show()
In this visualization you can see some anti-correlation!
Distance correlation is used here to check time-series correlation in a multidimensional space. I understand most of it — except perhaps why the unbiased estimator can be negative...
import dcor

# Pairwise distance correlation (and its independence-test statistic)
# between the per-state (days x 3 emotions) time series.
state_list = list(mean_dico_tweet_state.keys())
# PERF FIX: build each state's array once; the original rebuilt both arrays
# from scratch for every one of the n^2 pairs.
series = {s: np.array([[mean_dico_tweet_state[s][p]['positive'],
                        mean_dico_tweet_state[s][p]['negative'],
                        mean_dico_tweet_state[s][p]['neutral/ambiguous']]
                       for p in mean_dico_tweet_state[s].keys()])
          for s in state_list}
n_states = len(state_list)
dcor_matrix = np.zeros((n_states, n_states), dtype='f')
dcor_matrix_pval = np.zeros((n_states, n_states), dtype='f')
for i, s1 in enumerate(state_list):
    for j, s2 in enumerate(state_list):
        dcor_matrix[i][j] = dcor.u_distance_correlation_sqr(series[s1], series[s2], exponent=0.5)
        dcor_matrix_pval[i][j] = dcor.independence.distance_covariance_test(
            series[s1], series[s2], exponent=0.5)[0]
plt.hist(dcor_matrix_pval.flatten(), 100)
plt.xlabel('Distance correlation pval')
plt.show()
Unfortunately none of these distance correlations are significant, so everything below is not significant. Still, conceptually it is something we could consider for further studies.
# Distribution of the pairwise distance-correlation statistics themselves.
dcor_values = dcor_matrix.flatten()
plt.hist(dcor_values, 100)
plt.xlabel('Distance correlation t stat (?)')
plt.show()
import scipy.cluster.hierarchy as sch


def cluster_def(X):
    """Hierarchical clustering of a condensed distance matrix.

    Parameters
    ----------
    X : 1-D condensed distance matrix, as returned by scipy's pdist.

    Returns
    -------
    (leaves, linkage) : the dendrogram leaf order and the linkage matrix.

    DOC FIX: the original docstring said "median" linkage, but the code
    uses Ward's method.
    """
    D = np.copy(X)
    Y = sch.linkage(D, method='ward')
    Z = sch.dendrogram(Y, no_plot=True)
    return Z['leaves'], Y
import scipy as sc

# Reorder the correlation matrix by the dendrogram leaf order so that
# clustered states end up adjacent; the result stays symmetric.
Z_exp, Y_exp = cluster_def(sc.spatial.distance.pdist(dcor_matrix))
cluster_exp = np.zeros(np.shape(dcor_matrix))
for i in range(cluster_exp.shape[0]):
    for j in range(i + 1, cluster_exp.shape[1]):
        value = dcor_matrix[Z_exp[i], Z_exp[j]]
        cluster_exp[i, j] = value
        cluster_exp[j, i] = value
# Inverse permutation: original position -> position in the reordered matrix.
inv_Z_exp = {leaf: pos for pos, leaf in enumerate(Z_exp)}
from matplotlib.cm import get_cmap
from matplotlib.colors import PowerNorm
from matplotlib.colors import LinearSegmentedColormap, Normalize

# Column colours encode each state's election polarity (blue = Democratic,
# red = Republican, clipped at +/-0.04 like the maps).
colors = ['#0C4DFF', '#ff0000']
line_cmap = LinearSegmentedColormap.from_list('my_cmap', colors)
line_norm = Normalize(vmin=-0.04, vmax=0.04)
col_col = [line_cmap(line_norm(dico_state_plority[s])) for s in list(mean_dico_tweet_state.keys())]

a = sns.clustermap(dcor_matrix, z_score=None, row_cluster=True, col_cluster=True,
                   method='ward', cmap='coolwarm', col_colors=col_col,
                   vmin=-1, vmax=1)
b = a.ax_heatmap
# Relabel both axes with full state names in dendrogram leaf order.
state_keys = list(mean_dico_tweet_state.keys())
tick_labels = [dico_states_reverse[state_keys[Z_exp[j]]] for j in range(len(Z_exp))]
tick_positions = np.arange(0.5, len(Z_exp) + 0.5, 1)
b.set_xticks(tick_positions)
b.set_xticklabels(tick_labels, fontsize=10, rotation=90)
b.set_yticks(tick_positions)
b.set_yticklabels(tick_labels, fontsize=10, rotation=0)
plt.show()
from sklearn.metrics import euclidean_distances
from sklearn import manifold

# 2-D metric MDS on the dissimilarity 1 - |dcor|, so strongly (anti-)
# correlated states land close together; points coloured by polarity.
# FIX: removed the stray plt.figure(figsize=(5, 5)) that only produced the
# empty "<Figure size 360x360 with 0 Axes>" artefact, and the unused 'col'
# list (the plotly scatter colours by the 'polarity' column itself).
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                   random_state=np.random.RandomState(seed=3),
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(1 - abs(dcor_matrix)).embedding_
df_MDS = pd.DataFrame([[pos[i][0], pos[i][1]] for i in range(pos.shape[0])],
                      columns=["MDS axis 1", "MDS axis 2"])
df_MDS['name'] = list(mean_dico_tweet_state.keys())
df_MDS['polarity'] = [dico_state_plority[n] for n in list(mean_dico_tweet_state.keys())]
px.scatter(df_MDS, hover_data=['name'],
           x="MDS axis 1", y="MDS axis 2", color='polarity',
           color_continuous_scale=['Blue', 'Red'],
           range_color=(-0.04, 0.04))